---
title: "Analysis Code PP2025"
author: "Chloe Mortenson"
date: "2025-10-20"
output: html_document
---

 DEMOCRACY CONJOINT EXPERIMENT - DATA CLEANING AND ANALYSIS SCRIPT
# 
# Purpose: This script performs comprehensive data cleaning, Thurstone Case V 
#          scaling analysis, conjoint experiment data preparation, CJBART modeling,
#          and IMCE/VIMP calculations for a democracy preferences study.
#
# Author: [Chloe Mortenson & Dr.Erik Nisbet]
# Date: 10/20/25
# Version: 1.0
#
# Input: Raw Qualtrics survey data (CSV format)
# Output: Cleaned datasets, model objects, and analysis results (RDS files)

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

### Cleaning Conjoint Data and Running Thurstone Analysis 

```{r}
# Load required libraries
library(tidyr)
library(dplyr)
library(stringr)
library(irr)       # For Kendall's W
library(psych)     # For additional statistics
library(knitr)     # For tables
library(flextable) # For Word-friendly tables
library(survival)  # For conditional logit model

# Helper function for safe numeric conversion
safe_as_numeric <- function(x) {
  as.numeric(as.character(x))
}

# Read the CSV file
dt <- read.csv("/your-directory-here/Understanding Democracy Final Version_August 16, 2024_13.32.csv", header=TRUE)
dt <- dt %>% slice(-1, -2)

# Identify relevant columns
understanding_dem_cols <- grep("^understanding_dem_", names(dt), value=TRUE)
attent_cols <- grep("^attent_", names(dt), value=TRUE)
choice_cols <- c("C1", "C2", "C3", "C4", "C5")
feature_cols <- grep("^feature\\d+\\.\\d+\\.\\d+_CBCONJOINT$", names(dt), value=TRUE)
id_col <- "ResponseId"
z_vars <- c("age", "educ", "idpol", "idsoc", "eideo", "race")

# Define the items in each dimension
distributive_items <- c("understanding_dem_4", "understanding_dem_5", "understanding_dem_6")
procedural_items <- c("understanding_dem_7", "understanding_dem_8", "understanding_dem_9")
liberal_items <- c("understanding_dem_11", "understanding_dem_12", "understanding_dem_13")

# Convert variables to numeric
dt <- dt %>%
  mutate(across(all_of(z_vars), safe_as_numeric),
         across(all_of(attent_cols), safe_as_numeric),
         across(all_of(understanding_dem_cols), safe_as_numeric))

# Transform race into binary variable and calculate other variables
dt <- dt %>%
  mutate(
    polint = rowMeans(dt[, attent_cols], na.rm = TRUE),
    race_binary = ifelse(race == 1, 1, 0),
    ideo = rowMeans(dt[, c("idpol", "idsoc", "eideo")], na.rm = TRUE)
  )

# Create subsets for each dimension
distributive_data <- dt %>% select(all_of(distributive_items)) %>% na.omit()
procedural_data <- dt %>% select(all_of(procedural_items)) %>% na.omit()
liberal_data <- dt %>% select(all_of(liberal_items)) %>% na.omit()

#-------------------------------------------------------------------------------
# Thurstone Case V Analysis
#-------------------------------------------------------------------------------

# Function for Thurstone Case V analysis with additional fit statistics
thurstone_case_v <- function(data) {
  # Calculate Kendall's W with and without correction for ties
  kendalls_w_corrected <- kendall(as.matrix(data), correct = TRUE)$value
  kendalls_w_uncorrected <- kendall(as.matrix(data), correct = FALSE)$value
  
  # Number of items and subjects
  n_items <- ncol(data)
  n_subjects <- nrow(data)
  item_names <- names(data)
  
  # Initialize matrices
  freq_matrix <- matrix(0, nrow=n_items, ncol=n_items)
  prop_matrix <- matrix(0.5, nrow=n_items, ncol=n_items)
  rownames(freq_matrix) <- rownames(prop_matrix) <- item_names
  colnames(freq_matrix) <- colnames(prop_matrix) <- item_names
  
  # Fill preference matrices
  for (i in 1:n_subjects) {
    for (j in 1:n_items) {
      for (k in 1:n_items) {
        if (j != k) {
          # Lower value = higher preference (for rankings)
          if (data[i, j] < data[i, k]) {
            freq_matrix[j, k] <- freq_matrix[j, k] + 1
          }
        }
      }
    }
  }
  
  # Calculate proportions with Laplace smoothing
  for (j in 1:n_items) {
    for (k in 1:n_items) {
      if (j != k) {
        total_comparisons <- freq_matrix[j, k] + freq_matrix[k, j]
        prop_matrix[j, k] <- (freq_matrix[j, k] + 0.5) / (total_comparisons + 1)
        prop_matrix[k, j] <- 1 - prop_matrix[j, k]
      }
    }
  }
  
  # Convert to z-scores
  z_matrix <- qnorm(prop_matrix)
  diag(z_matrix) <- 0
  
  # Handle extreme values
  z_matrix[is.infinite(z_matrix)] <- ifelse(z_matrix[is.infinite(z_matrix)] > 0, 4.0, -4.0)
  
  # Calculate scale values
  scale_values <- rowMeans(z_matrix, na.rm = TRUE)
  
  # Center the scale values
  scale_values <- scale_values - mean(scale_values)
  
  # Calculate standard errors
  se_values <- apply(z_matrix, 1, function(x) sd(x, na.rm = TRUE) / sqrt(n_items - 1))
  
  # Calculate model fit statistics
  
  # 1. Stress (Kruskal's stress formula 1) - lower is better
  # Adapted for Thurstone model - measures discrepancy between observed proportions and model-implied proportions
  model_prop <- pnorm(outer(scale_values, scale_values, "-"))
  diag(model_prop) <- 0.5
  stress <- sqrt(sum((prop_matrix - model_prop)^2) / sum((prop_matrix - mean(prop_matrix))^2))
  
  # 2. Calculate R-squared - higher is better
  # Proportion of variance in observed proportions explained by the model
  ss_total <- sum((prop_matrix - mean(prop_matrix))^2)
  ss_residual <- sum((prop_matrix - model_prop)^2)
  r_squared <- 1 - (ss_residual / ss_total)
  
  # 3. RMSE - Root Mean Square Error - lower is better
  rmse <- sqrt(mean((prop_matrix - model_prop)^2))
  
  # 4. Mean Absolute Error - lower is better
  mae <- mean(abs(prop_matrix - model_prop))
  
  # 5. Maximum absolute discrepancy
  max_disc <- max(abs(prop_matrix - model_prop))
  
  # Create results table
  results_df <- data.frame(
    Item = item_names,
    Scale_Value = scale_values,
    SE = se_values
  )
  
  # Sort by scale value
  results_df <- results_df %>% arrange(desc(Scale_Value))
  
  # Return all results
  return(list(
    scale_values = results_df,
    n_subjects = n_subjects,
    kendalls_w_corrected = kendalls_w_corrected,
    kendalls_w_uncorrected = kendalls_w_uncorrected,
    stress = stress,
    r_squared = r_squared,
    rmse = rmse,
    mae = mae,
    max_discrepancy = max_disc,
    matrices = list(freq = freq_matrix, prop = prop_matrix, z = z_matrix, model_prop = model_prop)
  ))
}

# Run analysis for each dimension
distributive_results <- thurstone_case_v(distributive_data)
procedural_results <- thurstone_case_v(procedural_data)
liberal_results <- thurstone_case_v(liberal_data)

# Create a comprehensive fit statistics table
fit_stats_table <- data.frame(
  Dimension = c("Distributive", "Procedural", "Liberal"),
  N = c(
    distributive_results$n_subjects,
    procedural_results$n_subjects,
    liberal_results$n_subjects
  ),
  `Kendalls_W_Corrected` = c(
    distributive_results$kendalls_w_corrected,
    procedural_results$kendalls_w_corrected,
    liberal_results$kendalls_w_corrected
  ),
  `Kendalls_W_Uncorrected` = c(
    distributive_results$kendalls_w_uncorrected,
    procedural_results$kendalls_w_uncorrected,
    liberal_results$kendalls_w_uncorrected
  ),
  `R_Squared` = c(
    distributive_results$r_squared,
    procedural_results$r_squared,
    liberal_results$r_squared
  ),
  Stress = c(
    distributive_results$stress,
    procedural_results$stress,
    liberal_results$stress
  ),
  RMSE = c(
    distributive_results$rmse,
    procedural_results$rmse,
    liberal_results$rmse
  ),
  MAE = c(
    distributive_results$mae,
    procedural_results$mae,
    liberal_results$mae
  ),
  `Max_Discrepancy` = c(
    distributive_results$max_discrepancy,
    procedural_results$max_discrepancy,
    liberal_results$max_discrepancy
  )
)

# Rename the columns for better headings in Word
names(fit_stats_table) <- c("Dimension", "N", "Kendall's W (corrected)", 
                            "Kendall's W (uncorrected)", "R²", "Stress", 
                            "RMSE", "MAE", "Max Discrepancy")

# Create a table with scale values for each dimension separately
distributive_values <- distributive_results$scale_values %>%
  rename(`Scale Value` = Scale_Value)

procedural_values <- procedural_results$scale_values %>%
  rename(`Scale Value` = Scale_Value)

liberal_values <- liberal_results$scale_values %>%
  rename(`Scale Value` = Scale_Value)

# Save the Thurstone results for later use
save(distributive_results, procedural_results, liberal_results, 
     file = "thurstone_complete_results.RData")

# Create Word-friendly tables with flextable
if (requireNamespace("flextable", quietly = TRUE) && 
    requireNamespace("officer", quietly = TRUE)) {
  
  library(officer)
  
  # Function to create nicely formatted flextable
  create_nice_flextable <- function(df, caption) {
    ft <- flextable(df) %>%
      set_caption(caption) %>%
      autofit() %>%
      theme_box() %>%
      fontsize(size = 10) %>%
      bold(part = "header") %>%
      bg(bg = "#f7f7f7", part = "header")
    
    return(ft)
  }
  
  # Create fit statistics table
  fit_ft <- create_nice_flextable(fit_stats_table, 
                                 "Table 1: Fit Statistics for Thurstone Case V Analysis")
  
  # Format numbers in the fit statistics table
  fit_ft <- fit_ft %>%
    colformat_double(j = 3:9, digits = 4)
  
  # Create scale values tables
  dist_ft <- create_nice_flextable(distributive_values, 
                                   "Table 2: Thurstone Scale Values - Distributive Democracy")
  
  proc_ft <- create_nice_flextable(procedural_values, 
                                   "Table 3: Thurstone Scale Values - Procedural Democracy")
  
  lib_ft <- create_nice_flextable(liberal_values, 
                                  "Table 4: Thurstone Scale Values - Liberal Democracy")
  
  # Format scale values tables
  dist_ft <- dist_ft %>% colformat_double(j = 2:3, digits = 4)
  proc_ft <- proc_ft %>% colformat_double(j = 2:3, digits = 4)
  lib_ft <- lib_ft %>% colformat_double(j = 2:3, digits = 4)
  
  # Create a docx file with all tables
  doc <- read_docx()
  doc <- body_add_par(doc, "Thurstone Case V Analysis Results", style = "heading 1")
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Fit Statistics", style = "heading 2")
  doc <- body_add_par(doc, "Table 1 shows the fit statistics and agreement measures for each dimension of democracy. Kendall's W measures the level of agreement among respondents, with higher values indicating stronger consensus. The fit statistics (R², Stress, RMSE, MAE) indicate how well the Thurstone Case V model fits the data.")
  doc <- body_add_flextable(doc, value = fit_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Interpreting Kendall's W:", style = "heading 3")
  doc <- body_add_par(doc, "< 0.2: Very weak agreement")
  doc <- body_add_par(doc, "0.2-0.4: Weak agreement")
  doc <- body_add_par(doc, "0.4-0.6: Moderate agreement")
  doc <- body_add_par(doc, "0.6-0.8: Strong agreement")
  doc <- body_add_par(doc, "> 0.8: Very strong agreement")
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Scale Values by Dimension", style = "heading 2")
  doc <- body_add_par(doc, "The following tables show the Thurstone scale values for each item within the three democracy dimensions. Higher scale values indicate greater importance or preference.")
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Distributive Democracy", style = "heading 3")
  doc <- body_add_flextable(doc, value = dist_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Procedural Democracy", style = "heading 3")
  doc <- body_add_flextable(doc, value = proc_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Liberal Democracy", style = "heading 3")
  doc <- body_add_flextable(doc, value = lib_ft)
  
  # Save the document
  print(doc, target = "Democracy_Thurstone_Results.docx")
  cat("Word document created: Democracy_Thurstone_Results.docx\n")
}

# Also save as CSV files for easy import to Word if needed
write.csv(fit_stats_table, "thurstone_fit_statistics.csv", row.names = FALSE)
write.csv(distributive_values, "distributive_scale_values.csv", row.names = FALSE)
write.csv(procedural_values, "procedural_scale_values.csv", row.names = FALSE)
write.csv(liberal_values, "liberal_scale_values.csv", row.names = FALSE)

cat("CSV files saved for easy import to Word\n")

#-------------------------------------------------------------------------------
# Conjoint Experiment Analysis
#-------------------------------------------------------------------------------

# Parameters for the conjoint experiment
no.tasks <- 5
no.prof <- 2

# Get unique attributes (features)
attrib <- unique(dt$feature0.DISPLAY_NAME)
attrib <- attrib[attrib != ""]
if (length(attrib) == 0) {
  stop("No valid feature names found in feature0.DISPLAY_NAME")
}
cat("Attributes found:", paste(attrib, collapse = ", "), "\n")

# Create the storage dataframe
st <- expand.grid(ResponseId = unique(dt$ResponseId),
                  task = 1:no.tasks,
                  profile = 1:no.prof,
                  stringsAsFactors = FALSE)
st$selected <- NA

# Create columns for attributes and their row positions
for (attr in attrib) {
  st[[attr]] <- NA
  st[[paste(attr, "rowpos", sep = ".")]] <- NA
}

# Loop through respondents
for (i in seq_len(nrow(dt))) {
  if (i %% 100 == 0) cat("Processing row", i, "of", nrow(dt), "\n")
  
  # Loop through tasks
  for (t in 1:no.tasks) {
    
    # Loop through profiles
    for (p in 1:no.prof) {
      
      # Subsetter
      sub <- st$ResponseId == dt$ResponseId[i] & st$task == t & st$profile == p
      
      # Which profile was selected (1 or 2)
      st$selected[sub] <- as.numeric(dt[i, choice_cols[t]] == p)
      
      # Loop through attributes
      for (a in seq_len(length(attrib))) {
        for (aa in seq_len(length(attrib))) {
          
          # Check which attribute
          feature_col <- paste0("feature", a-1, ".", t, ".", p, "_CBCONJOINT")
          display_name_col <- paste0("feature", a-1, ".DISPLAY_NAME")
          
          if (dt[i, display_name_col] == attrib[aa]) {
            # Attribute row position
            st[sub, paste(attrib[aa], "rowpos", sep = ".")] <- a
            # Attribute values
            st[sub, attrib[aa]] <- dt[i, feature_col]
          }
        }
      }
      
    } # End loop over profiles
  } # End loop over tasks
} # End loop over respondents

# Ensure ResponseId columns match format
st <- st %>% mutate(ResponseId = as.character(ResponseId))

#-------------------------------------------------------------------------------
# Integrate Thurstone Case V Scores with Conjoint Data
#-------------------------------------------------------------------------------

# Create a data frame with Thurstone scores for each item
thurstone_scores <- bind_rows(
  distributive_results$scale_values %>% mutate(Dimension = "Distributive"),
  procedural_results$scale_values %>% mutate(Dimension = "Procedural"),
  liberal_results$scale_values %>% mutate(Dimension = "Liberal")
)

# Map the item names to more descriptive labels if needed
item_labels <- c(
  "understanding_dem_4" = "Distributive_Item_4",
  "understanding_dem_5" = "Distributive_Item_5",
  "understanding_dem_6" = "Distributive_Item_6",
  "understanding_dem_7" = "Procedural_Item_7",
  "understanding_dem_8" = "Procedural_Item_8",
  "understanding_dem_9" = "Procedural_Item_9",
  "understanding_dem_11" = "Liberal_Item_11",
  "understanding_dem_12" = "Liberal_Item_12",
  "understanding_dem_13" = "Liberal_Item_13"
)

# Add the labels to the Thurstone scores
thurstone_scores$Item_Label <- item_labels[thurstone_scores$Item]

# Create respondent-level Thurstone preference scores
# These will be used to analyze how individual preferences affect conjoint choices

# Merge Thurstone data with respondent data
# First, create a respondent-level dataset with the items
respondent_items <- dt %>%
  select(ResponseId, all_of(names(item_labels))) %>%
  distinct()

# Add dimension preference scores for each respondent
# Calculate weighted average for each dimension using Thurstone scale values as weights
dimension_scores <- respondent_items %>%
  rowwise() %>%
  mutate(
    # Distributive dimension score
    Distributive_Score = sum(c(
      understanding_dem_4 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_4"],
      understanding_dem_5 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_5"],
      understanding_dem_6 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_6"]
    ), na.rm = TRUE),
    
    # Procedural dimension score
    Procedural_Score = sum(c(
      understanding_dem_7 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_7"],
      understanding_dem_8 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_8"],
      understanding_dem_9 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_9"]
    ), na.rm = TRUE),
    
    # Liberal dimension score
    Liberal_Score = sum(c(
      understanding_dem_11 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_11"],
      understanding_dem_12 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_12"],
      understanding_dem_13 * thurstone_scores$Scale_Value[thurstone_scores$Item == "understanding_dem_13"]
    ), na.rm = TRUE)
  ) %>%
  ungroup()

# Add dimension preference ranking (high to low) for each respondent
dimension_scores <- dimension_scores %>%
  rowwise() %>%
  mutate(
    Preferred_Dimension = case_when(
      Distributive_Score >= Procedural_Score & Distributive_Score >= Liberal_Score ~ "Distributive",
      Procedural_Score >= Distributive_Score & Procedural_Score >= Liberal_Score ~ "Procedural",
      Liberal_Score >= Distributive_Score & Liberal_Score >= Procedural_Score ~ "Liberal",
      TRUE ~ NA_character_
    ),
    Second_Dimension = case_when(
      Preferred_Dimension == "Distributive" & Procedural_Score >= Liberal_Score ~ "Procedural",
      Preferred_Dimension == "Distributive" & Liberal_Score > Procedural_Score ~ "Liberal",
      Preferred_Dimension == "Procedural" & Distributive_Score >= Liberal_Score ~ "Distributive",
      Preferred_Dimension == "Procedural" & Liberal_Score > Distributive_Score ~ "Liberal",
      Preferred_Dimension == "Liberal" & Distributive_Score >= Procedural_Score ~ "Distributive",
      Preferred_Dimension == "Liberal" & Procedural_Score > Distributive_Score ~ "Procedural",
      TRUE ~ NA_character_
    ),
    Least_Dimension = case_when(
      Preferred_Dimension != "Distributive" & Second_Dimension != "Distributive" ~ "Distributive",
      Preferred_Dimension != "Procedural" & Second_Dimension != "Procedural" ~ "Procedural",
      Preferred_Dimension != "Liberal" & Second_Dimension != "Liberal" ~ "Liberal",
      TRUE ~ NA_character_
    )
  ) %>%
  ungroup()

# Create a demographic variables dataset
demographic_data <- dt %>%
  select(ResponseId, all_of(z_vars), race_binary, polint, ideo) %>%
  distinct()

# Now merge with the conjoint data - include both dimension scores and demographics
st <- st %>%
  left_join(select(dimension_scores, ResponseId, 
                  Distributive_Score, Procedural_Score, Liberal_Score,
                  Preferred_Dimension, Second_Dimension, Least_Dimension), 
            by = "ResponseId") %>%
  left_join(demographic_data, by = "ResponseId")


# Print a summary of the enhanced dataset
cat("\nConjoint dataset enhanced with Thurstone preference scores and demographics\n")
cat("Number of respondents:", length(unique(st$ResponseId)), "\n")
cat("Number of observations:", nrow(st), "\n")
cat("New variables added:\n")
cat("  - Thurstone scores: Distributive_Score, Procedural_Score, Liberal_Score\n")
cat("  - Preference rankings: Preferred_Dimension, Second_Dimension, Least_Dimension\n")
cat("  - Demographics:", paste(z_vars, collapse=", "), "\n")
cat("  - Additional variables: race_binary, polint, ideo\n")

# Sort the resulting dataframe
st <- st %>% arrange(ResponseId, task, profile)

# Save the enhanced dataset
write.csv(st, "conjoint_with_thurstone_preferences.csv", row.names = FALSE)
saveRDS(st, "conjoint_with_thurstone_preferences.rds")

# Check for any missing data
missing_data <- st %>%
  summarise(across(everything(), ~sum(is.na(.))))
cat("\n\nMissing data summary:\n")
print(missing_data)

# Save the complete workspace for future reference
save.image("democracy_thurstone_conjoint_complete.RData")

cat("\nAnalysis completed. All results saved.\n")
```


#### Data Structuring : 
```{r}


# Save the enhanced dataset
write.csv(st, "conjoint_with_thurstone_preferences.csv", row.names = FALSE)
saveRDS(st, "conjoint_with_thurstone_preferences.rds")

cat("\n=== CREATING CORRECTED MODEL DATASET ===\n")

# Define attributes for relabeling
attributes <- c("Freedom of the Press and Speech", 
                "Role of the President and Elections", 
                "Rule of Law", 
                "Economic Wellbeing")

# Get the actual unique values from the data for Economic Wellbeing
econ_values <- unique(st$`Economic Wellbeing`)
econ_values <- econ_values[!is.na(econ_values)]

cat("Actual Economic Wellbeing values in data:\n")
print(econ_values)

# Create correct mapping that uses the ACTUAL strings from the data
correct_econ_mapping <- setNames(
  c("Normative", "Majority", "Minority"),
  c(
    econ_values[grepl("Everyone.*NEEDS", econ_values)],  # Finds the smart apostrophe version
    econ_values[grepl("WEALTHY", econ_values)],
    econ_values[grepl("POOR", econ_values)]
  )
)

cat("\nCorrect Economic Wellbeing mapping:\n")
print(correct_econ_mapping)

# Original mapping for other attributes (these worked fine)
comprehensive_mapping <- c(
  "The media is FREE AND FAIR to all political views and there is NO censorship" = "Normative",
  "The media BIASED in FAVOR your political views and speech you OPPOSE is CONTROLLED." = "Majority",
  "The media is BIASED AGAINST your political views and speech you SUPPORT is CONTROLLED." = "Minority",
  "There are MANY checks and balances on the president; elections are FREE AND FAIR for all parties." = "Normative",
  "Presidents you SUPPORT are MOSTLY UNCHECKED; election rules BENEFIT the political party you support." = "Majority",
  "Presidents you OPPOSE are MOSTLY UNCHECKED; election rules DISADVANTAGE the political party you support." = "Minority",
  "Judicial decisions are IMPARTIAL and the rule of law is EQUALLY APPLIED to all" = "Normative",
  "Judicial decisions and the rule of law FAVOR people like you." = "Majority",
  "Judicial decisions and the rule of law DISADVANTAGE people like you" = "Minority"
)

# Create corrected dataset
corrected_model_data <- st

# Apply correct mapping for each attribute
for (attr in attributes) {
  if(attr == "Economic Wellbeing") {
    # Use the correct mapping for Economic Wellbeing
    corrected_model_data[[attr]] <- factor(
      correct_econ_mapping[as.character(corrected_model_data[[attr]])],
      levels = c("Normative", "Majority", "Minority")
    )
  } else {
    # Apply original mapping for other attributes
    original_values <- unique(corrected_model_data[[attr]])
    mismatches <- setdiff(original_values, names(comprehensive_mapping))
    temp_mapping <- comprehensive_mapping
    
    if (length(mismatches) > 0) {
      for (mismatch in mismatches) {
        temp_mapping[mismatch] <- "Unknown"
      }
    }
    
    new_values <- temp_mapping[as.character(corrected_model_data[[attr]])]
    corrected_model_data[[attr]] <- factor(new_values, 
                                           levels = c("Normative", "Majority", "Minority", "Unknown"))
  }
}

# Verify the correction worked
cat("\n=== VERIFICATION OF CORRECTION ===\n")
for(attr in attributes) {
  cat("\n", attr, " distribution:\n")
  print(table(corrected_model_data[[attr]], useNA = "always"))
}

# Prepare corrected CJBART data
corrected_cjbart_data <- corrected_model_data %>%
  mutate(across(all_of(attributes), ~factor(., levels = c("Normative", "Majority", "Minority")))) %>%
  mutate(selected = as.integer(selected)) %>%
  drop_na()

cat("\n=== CORRECTED CJBART DATA SUMMARY ===\n")
cat("Total observations:", nrow(corrected_cjbart_data), "\n")
cat("Respondents:", length(unique(corrected_cjbart_data$ResponseId)), "\n")

# Verify Economic Wellbeing has all three levels
econ_check <- table(corrected_cjbart_data$`Economic Wellbeing`)
cat("\nEconomic Wellbeing distribution in corrected CJBART data:\n")
print(econ_check)

if(econ_check["Normative"] > 0) {
  cat("\n✅ SUCCESS: Economic Wellbeing now has Normative observations!\n")
  cat("Normative count:", econ_check["Normative"], "\n")
} else {
  cat("\n❌ WARNING: Economic Wellbeing still has no Normative observations!\n")
}

# Save the corrected datasets
saveRDS(corrected_model_data, "corrected_model_data.rds")
saveRDS(corrected_cjbart_data, "corrected_cjbart_data.rds")
write.csv(corrected_cjbart_data, "corrected_cjbart_data.csv", row.names = FALSE)

cat("\n=== CORRECTED DATASETS SAVED ===\n")
cat("Files created:\n")
cat("  - corrected_model_data.rds (full dataset with corrected labels)\n")
cat("  - corrected_cjbart_data.rds (CJBART-ready dataset)\n")
cat("  - corrected_cjbart_data.csv (for inspection)\n")

#-------------------------------------------------------------------------------
# Continue with existing code...
#-------------------------------------------------------------------------------

# Check for any missing data
missing_data <- st %>%
  summarise(across(everything(), ~sum(is.na(.))))
cat("\n\nMissing data summary:\n")
print(missing_data)

# Save the complete workspace for future reference
save.image("democracy_thurstone_conjoint_complete.RData")

cat("\nAnalysis completed. All results saved.\n")
```


#### Mean Check for Thurstone Case V Scores: 
```{r}

# Add this code after the Thurstone Case V analyses have been run
# (after the section where distributive_results, procedural_results, and liberal_results are created)

#-------------------------------------------------------------------------------
# Calculate Overall Mean Scores for Each Dimension
#-------------------------------------------------------------------------------

# Calculate simple mean scores for each dimension (across the entire sample)
distributive_mean <- mean(rowMeans(distributive_data, na.rm = TRUE), na.rm = TRUE)
procedural_mean <- mean(rowMeans(procedural_data, na.rm = TRUE), na.rm = TRUE)
liberal_mean <- mean(rowMeans(liberal_data, na.rm = TRUE), na.rm = TRUE)

# Calculate mean scores for individual items (across the entire sample)
distributive_item_means <- colMeans(distributive_data, na.rm = TRUE)
procedural_item_means <- colMeans(procedural_data, na.rm = TRUE)
liberal_item_means <- colMeans(liberal_data, na.rm = TRUE)

# Create a summary table of dimension means
dimension_means_table <- data.frame(
  Dimension = c("Distributive", "Procedural", "Liberal"),
  Mean_Score = c(distributive_mean, procedural_mean, liberal_mean),
  N = c(
    nrow(distributive_data),
    nrow(procedural_data),
    nrow(liberal_data)
  )
)

# Create item-level mean tables for each dimension
distributive_items_table <- data.frame(
  Item = names(distributive_item_means),
  Mean_Score = distributive_item_means,
  Scale_Value = distributive_results$scale_values$Scale_Value[match(names(distributive_item_means), distributive_results$scale_values$Item)]
) %>% arrange(desc(Scale_Value))

procedural_items_table <- data.frame(
  Item = names(procedural_item_means),
  Mean_Score = procedural_item_means,
  Scale_Value = procedural_results$scale_values$Scale_Value[match(names(procedural_item_means), procedural_results$scale_values$Item)]
) %>% arrange(desc(Scale_Value))

liberal_items_table <- data.frame(
  Item = names(liberal_item_means),
  Mean_Score = liberal_item_means,
  Scale_Value = liberal_results$scale_values$Scale_Value[match(names(liberal_item_means), liberal_results$scale_values$Item)]
) %>% arrange(desc(Scale_Value))

# Print the results
cat("\n----- Overall Mean Scores for Democracy Dimensions -----\n")
print(dimension_means_table)

cat("\n----- Distributive Democracy Items -----\n")
print(distributive_items_table)

cat("\n----- Procedural Democracy Items -----\n")
print(procedural_items_table)

cat("\n----- Liberal Democracy Items -----\n")
print(liberal_items_table)

# Create Word-friendly tables for dimension means if flextable is available
if (requireNamespace("flextable", quietly = TRUE) && 
    requireNamespace("officer", quietly = TRUE)) {
  
  # Create dimension means table
  means_ft <- create_nice_flextable(dimension_means_table, 
                                  "Table 5: Overall Mean Scores for Democracy Dimensions")
  
  # Format numbers
  means_ft <- means_ft %>% colformat_double(j = 2, digits = 4)
  
  # Create item-level tables
  dist_means_ft <- create_nice_flextable(distributive_items_table, 
                                    "Table 6: Distributive Democracy - Item Means and Scale Values")
  
  proc_means_ft <- create_nice_flextable(procedural_items_table, 
                                    "Table 7: Procedural Democracy - Item Means and Scale Values")
  
  lib_means_ft <- create_nice_flextable(liberal_items_table, 
                                   "Table 8: Liberal Democracy - Item Means and Scale Values")
  
  # Format item tables
  dist_means_ft <- dist_means_ft %>% colformat_double(j = 2:3, digits = 4)
  proc_means_ft <- proc_means_ft %>% colformat_double(j = 2:3, digits = 4)
  lib_means_ft <- lib_means_ft %>% colformat_double(j = 2:3, digits = 4)
  
  # Add to the existing document
  doc <- body_add_par(doc, "Dimension Mean Scores", style = "heading 2")
  doc <- body_add_par(doc, "The following table shows the overall mean scores for each democracy dimension across all respondents.")
  doc <- body_add_flextable(doc, value = means_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Item-Level Mean Scores", style = "heading 2")
  doc <- body_add_par(doc, "These tables show the mean scores for each item along with their Thurstone scale values.")
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Distributive Democracy Items", style = "heading 3")
  doc <- body_add_flextable(doc, value = dist_means_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Procedural Democracy Items", style = "heading 3")
  doc <- body_add_flextable(doc, value = proc_means_ft)
  doc <- body_add_par(doc, "")
  
  doc <- body_add_par(doc, "Liberal Democracy Items", style = "heading 3")
  doc <- body_add_flextable(doc, value = lib_means_ft)
  
  # Save the updated document
  print(doc, target = "Democracy_Thurstone_Results_with_Means.docx")
  cat("Updated Word document created: Democracy_Thurstone_Results_with_Means.docx\n")
}

# Save mean scores as CSV files
write.csv(dimension_means_table, "dimension_overall_means.csv", row.names = FALSE)
write.csv(distributive_items_table, "distributive_items_means.csv", row.names = FALSE)
write.csv(procedural_items_table, "procedural_items_means.csv", row.names = FALSE)
write.csv(liberal_items_table, "liberal_items_means.csv", row.names = FALSE)

cat("CSV files with mean scores saved\n")

# Update the workspace
save.image("democracy_thurstone_conjoint_complete_with_means.RData")

cat("\nAnalysis updated with overall mean scores for dimensions. All results saved.\n")
```


### Running CJ-Bart Analysis, AMCE and IMCE Analysis 
```{r}
#### Running cjbart model, IMCEs and AMCES

library(dplyr)
library(cregg)
library(cjbart)

# Load the corrected data
model_data <- readRDS("corrected_model_data.rds")

cat("=== COMPREHENSIVE FIX FOR STATISTICAL ISSUES ===\n")

#===============================================================================
# STEP 1: Create Unique Factor Level Names (Fixes both AMCE and IMCE)
#===============================================================================

cat("Step 1: Creating unique factor level names...\n")

# Create clean, unique factor names for each attribute
model_data_fixed <- model_data %>%
  mutate(
    # Economic Wellbeing -> Econ_*
    Econ_Wellbeing = factor(
      case_when(
        `Economic Wellbeing` == "Normative" ~ "Econ_Normative",
        `Economic Wellbeing` == "Majority" ~ "Econ_Majority", 
        `Economic Wellbeing` == "Minority" ~ "Econ_Minority"
      ),
      levels = c("Econ_Normative", "Econ_Majority", "Econ_Minority")
    ),
    
    # Freedom of Press -> Press_*
    Press_Freedom = factor(
      case_when(
        `Freedom of the Press and Speech` == "Normative" ~ "Press_Normative",
        `Freedom of the Press and Speech` == "Majority" ~ "Press_Majority",
        `Freedom of the Press and Speech` == "Minority" ~ "Press_Minority"
      ),
      levels = c("Press_Normative", "Press_Majority", "Press_Minority")
    ),
    
    # Rule of Law -> Law_*
    Rule_Law = factor(
      case_when(
        `Rule of Law` == "Normative" ~ "Law_Normative",
        `Rule of Law` == "Majority" ~ "Law_Majority",
        `Rule of Law` == "Minority" ~ "Law_Minority"
      ),
      levels = c("Law_Normative", "Law_Majority", "Law_Minority")
    ),
    
    # Role of President -> Pres_*
    Presidential_Role = factor(
      case_when(
        `Role of the President and Elections` == "Normative" ~ "Pres_Normative",
        `Role of the President and Elections` == "Majority" ~ "Pres_Majority",
        `Role of the President and Elections` == "Minority" ~ "Pres_Minority"
      ),
      levels = c("Pres_Normative", "Pres_Majority", "Pres_Minority")
    )
  )

# Verify unique factor levels
cat("Verifying unique factor levels:\n")
new_attributes <- c("Econ_Wellbeing", "Press_Freedom", "Rule_Law", "Presidential_Role")

for(attr in new_attributes) {
  cat(attr, ":", paste(levels(model_data_fixed[[attr]]), collapse = ", "), "\n")
}

#===============================================================================
# STEP 2: Fix Data Structure for IMCE (Address Covariate Issues)
#===============================================================================

cat("\nStep 2: Fixing data structure for IMCE...\n")

# Create respondent-level demographics (constant within person)
respondent_demographics <- model_data_fixed %>%
  group_by(ResponseId) %>%
  summarise(
    # Take the first non-NA value for each demographic (should be same for all rows)
    age_const = first(age[!is.na(age)]),
    educ_const = first(educ[!is.na(educ)]),
    ideo_const = first(ideo[!is.na(ideo)]),
    polint_const = first(polint[!is.na(polint)]),
    race_binary_const = first(race_binary[!is.na(race_binary)]),
    
    # Thurstone scores (should be constant within person)
    Liberal_Score_const = first(Liberal_Score[!is.na(Liberal_Score)]),
    Procedural_Score_const = first(Procedural_Score[!is.na(Procedural_Score)]),
    Distributive_Score_const = first(Distributive_Score[!is.na(Distributive_Score)]),
    
    .groups = "drop"
  )

# Merge back with choice data
imce_data <- model_data_fixed %>%
  select(ResponseId, task, profile, selected, all_of(new_attributes)) %>%
  left_join(respondent_demographics, by = "ResponseId") %>%
  # Ensure selected is integer
  mutate(selected = as.integer(selected))

# Check for any within-person variation (should be zero)
variation_check <- imce_data %>%
  group_by(ResponseId) %>%
  summarise(
    age_var = var(age_const, na.rm = TRUE),
    educ_var = var(educ_const, na.rm = TRUE),
    liberal_var = var(Liberal_Score_const, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  summarise(
    max_age_var = max(age_var, na.rm = TRUE),
    max_educ_var = max(educ_var, na.rm = TRUE),
    max_liberal_var = max(liberal_var, na.rm = TRUE)
  )

cat("Within-person variation check (should be 0):\n")
print(variation_check)

#===============================================================================
# STEP 3: Run Fixed AMCE Analysis
#===============================================================================

cat("\nStep 3: Running fixed AMCE analysis...\n")

# Now AMCE should work without duplicate factor issues
fixed_amce <- cj(
  data = model_data_fixed,
  formula = selected ~ Econ_Wellbeing + Press_Freedom + Rule_Law + Presidential_Role,
  id = ~ResponseId,
  estimate = "amce"
)

cat("✅ AMCE completed successfully!\n")

# Display Economic Wellbeing results
econ_amce_results <- fixed_amce %>% 
  filter(feature == "Econ_Wellbeing") %>%
  select(feature, level, estimate, std.error, lower, upper)

cat("\nFixed AMCE Results for Economic Wellbeing:\n")
print(econ_amce_results)

#===============================================================================
# STEP 4: Refit CJBART with Fixed Data Structure
#===============================================================================

cat("\nStep 4: Refitting CJBART with fixed data structure...\n")

# Define covariates that are constant within person
cjbart_covariates <- c("age_const", "educ_const", "ideo_const", "polint_const", 
                      "race_binary_const", "Liberal_Score_const", 
                      "Procedural_Score_const", "Distributive_Score_const")

# Prepare data for CJBART
cjbart_data <- imce_data %>%
  # Remove any remaining NA values
  filter(
    !is.na(selected) & 
    !is.na(Econ_Wellbeing) & !is.na(Press_Freedom) & 
    !is.na(Rule_Law) & !is.na(Presidential_Role) &
    !is.na(ResponseId)
  ) %>%
  # Ensure all factor levels are present
  mutate(
    Econ_Wellbeing = factor(Econ_Wellbeing, levels = c("Econ_Normative", "Econ_Majority", "Econ_Minority")),
    Press_Freedom = factor(Press_Freedom, levels = c("Press_Normative", "Press_Majority", "Press_Minority")),
    Rule_Law = factor(Rule_Law, levels = c("Law_Normative", "Law_Majority", "Law_Minority")),
    Presidential_Role = factor(Presidential_Role, levels = c("Pres_Normative", "Pres_Majority", "Pres_Minority"))
  )

cat("CJBART data dimensions:", nrow(cjbart_data), "rows,", length(unique(cjbart_data$ResponseId)), "respondents\n")

# Fit CJBART with fixed data structure
fixed_cjbart_model <- cjbart(
  data = cjbart_data,
  Y = "selected",
  id = "ResponseId", 
  type = "choice",
  cores = 6,
  ntree = 500,
  nskip = 10000,
  ndpost = 10000,
  keepevery = 10,
  keeptrainfits = TRUE,
  usequants = TRUE,
  printevery = 1000
)

cat("✅ CJBART refit completed successfully!\n")

#===============================================================================
# STEP 5: Calculate Fixed IMCEs
#===============================================================================

cat("\nStep 5: Calculating fixed IMCEs...\n")

# Define reference levels for unique factor names
ref_levels_fixed <- c(
  "Econ_Normative", "Press_Normative", 
  "Law_Normative", "Pres_Normative"
)
names(ref_levels_fixed) <- new_attributes

cat("Reference levels:", paste(names(ref_levels_fixed), "=", ref_levels_fixed, collapse = ", "), "\n")

# Calculate IMCEs with fixed structure
fixed_imce_results <- IMCE(
  data = cjbart_data,
  model = fixed_cjbart_model,
  attribs = new_attributes,
  ref_levels = ref_levels_fixed,
  method = "bayes",
  alpha = 0.05,
  cores = 1
)

cat("✅ IMCE calculation completed successfully!\n")


#===============================================================================
# LOAD REQUIRED OBJECTS FOR VIMP CALCULATION
#===============================================================================

library(cjbart)
library(dplyr)

cat("Loading required objects for VIMP calculation...\n\n")

# Load the CJBART model
fixed_cjbart_model <- readRDS("final_fixed_cjbart.rds")
cat("✅ Loaded: final_fixed_cjbart.rds\n")

# Load the data used for CJBART
model_data_fixed <- readRDS("final_fixed_data.rds")
cat("✅ Loaded: final_fixed_data.rds\n")

# Recreate cjbart_data (the cleaned dataset used for modeling)
new_attributes <- c("Econ_Wellbeing", "Press_Freedom", "Rule_Law", "Presidential_Role")

# Create respondent-level demographics
respondent_demographics <- model_data_fixed %>%
  group_by(ResponseId) %>%
  summarise(
    age_const = first(age[!is.na(age)]),
    educ_const = first(educ[!is.na(educ)]),
    ideo_const = first(ideo[!is.na(ideo)]),
    polint_const = first(polint[!is.na(polint)]),
    race_binary_const = first(race_binary[!is.na(race_binary)]),
    Liberal_Score_const = first(Liberal_Score[!is.na(Liberal_Score)]),
    Procedural_Score_const = first(Procedural_Score[!is.na(Procedural_Score)]),
    Distributive_Score_const = first(Distributive_Score[!is.na(Distributive_Score)]),
    .groups = "drop"
  )

# Recreate the CJBART data
cjbart_data <- model_data_fixed %>%
  select(ResponseId, task, profile, selected, all_of(new_attributes)) %>%
  left_join(respondent_demographics, by = "ResponseId") %>%
  mutate(selected = as.integer(selected)) %>%
  filter(
    !is.na(selected) & 
    !is.na(Econ_Wellbeing) & !is.na(Press_Freedom) & 
    !is.na(Rule_Law) & !is.na(Presidential_Role) &
    !is.na(ResponseId)
  )

cat("✅ Recreated: cjbart_data\n")
cat("\nReady to run VIMP calculation!\n\n")

#===============================================================================
# STEP 6.5: Calculate and Save VIMP Results (Using het_vimp with IMCEs)
#===============================================================================

cat("Step 6.5: Calculating Variable Importance (VIMP)...\n")
cat("Loading IMCE results...\n")

# Load the IMCE results (needed for het_vimp)
fixed_imce_results <- readRDS("final_fixed_imce.rds")
cat("✅ Loaded: final_fixed_imce.rds\n")

cat("Calculating VIMP - this may take several minutes...\n")

# Use het_vimp with the IMCE results
vimp_results <- het_vimp(
  fit = fixed_cjbart_model,
  imces = fixed_imce_results,
  attribs = new_attributes,
  covars = c("Liberal_Score_const", "Procedural_Score_const", 
             "Distributive_Score_const", "educ_const", 
             "polint_const", "ideo_const", "age_const")
)

cat("✅ VIMP calculation completed!\n")

# Package results in the format expected by the figure script
vimp_package <- list(
  results = vimp_results,
  calculation_date = Sys.time(),
  model_info = list(
    ntree = 500,
    nskip = 10000,
    ndpost = 10000
  )
)

# Save the results
saveRDS(vimp_package, "final_vimp_results.rds")
cat("✅ Saved: final_vimp_results.rds\n")
#===============================================================================
# STEP 7: Save Fixed Results (UPDATED)
#===============================================================================

cat("\nStep 7: Saving fixed results...\n")

# Save all fixed results
saveRDS(model_data_fixed, "final_fixed_data.rds")
saveRDS(fixed_amce, "final_fixed_amce.rds")
saveRDS(fixed_cjbart_model, "final_fixed_cjbart.rds")
saveRDS(fixed_imce_results, "final_fixed_imce.rds")
saveRDS(vimp_package, "final_vimp_results.rds")  # <-- NEW LINE

cat("✅ All fixed results saved successfully!\n")
cat("\nFiles created:\n")
cat("  - final_fixed_data.rds\n")
cat("  - final_fixed_amce.rds\n")
cat("  - final_fixed_cjbart.rds\n")
cat("  - final_fixed_imce.rds\n")
cat("  - final_vimp_results.rds\n")  # <-- NEW LINE




```



